*********************************************
* Title: rwanda_jde_tableA10-A11.do
* Author: Todd Pugatch
* Last update: June 4 2024
* Description: analysis for Blimpo and Pugatch, "Entrepreneurship Education
*	and Teacher Training in Rwanda," Stage 2 Registered report, Journal of 
*	Development Economics
* Inputs: 	student_merge_jde.dta
*			rwanda_school_trainattend_jde.dta
* Outputs: 	rwanda_jde_tableA10-A11.txt
*			rwanda_jde_tableA10-A11_t[4-5].out
* Notes: produces Tables A10-A11
**********************************************

local start=`"$S_TIME"'
clear
clear matrix
clear mata
graph drop _all
program drop _all
cap log close
set more off

* Set directories 
*global main "[SET MAIN DIRECTORY HERE]"
	global rawdata "$main/01_data/01_raw"
	global cleandata "$main/01_data/02_clean"
	global dofiles "$main/02_dofiles"
	global results "$main/03_results"
	global temp "$main/04_output"

* begin log file
log using "$temp/rwanda_jde_tableA10-A11.txt", text replace

* data prep: merge take-up with student surveys, by school
local takeup "training_any_pct exchange_pct"
qui use "$cleandata/student_merge_jde.dta", clear
qui merge m:1 school_code using "$cleandata/rwanda_school_trainattend_jde.dta", keepusing(`takeup')
qui drop if _merge==2
drop _merge
qui save "$temp/studenttemp.dta", replace	

*******************************************
* SET UP MULTIPLE HYPOTHESIS CORRECTION
/*Proceed in 3 steps:
	1. Run all regressions for H1. Save coefficients & std. errors.
	2. Calculate q-values for all hypotheses.
	3. Re-run regressions to get output, including q-values.*/
*******************************************

* 1. GET COEF/SE/P FOR ALL REGRESSIONS	
* ACADEMIC OUTCOMES
/*Goal: produce columns 1-3 of Table P4
	Uses endline student survey.
	NOTE: still awaiting administrative data on exam performance to produce columns 1-2.
	Regress academic outcome on treatment status, controlling
		for randomization strata. 
	Cluster s.e.'s by school.*/
qui use "$temp/studenttemp.dta", clear

/*get non-normalized control group mean test scores, per request of Della Vigna et al
	team at Berkeley gathering predictions on project outcomes*/
su entrep_grade_el if treatment==0
	
* normalize exam scores to control mean
/*careful because distribution of S6 entrepreneurship exam score is discrete (0-6)*/
foreach y in lastpromexam_el entrep_grade_el S3_exam_bl {
	qui su `y' if treatment==0
	qui gen z`y'=(`y'-r(mean))/r(sd)
}
lab var zentrep_grade_el "S6 entrepreneurship exam score (z)"
lab var zlastpromexam_el "S4/S5 promotional exam score (z)"
lab var zS3_exam_bl "S3 exam score (z)"

* prep missing values for students without baseline outcome
qui gen zS3_exam_bli=zS3_exam_bl
qui su zS3_exam_bl if treatment==0
qui replace zS3_exam_bli=r(mean) if zS3_exam_bl==.	
qui gen S3_exam_blm=(S3_exam_bl==.)
lab var zS3_exam_bli "zS3_exam_bl, imputing control mean for missing values"
lab var S3_exam_blm "missing value for S3_exam_bl"

/*Columns 1-3: exam scores*/
/*Total score (col 1) unavailable*/
local Y "zentrep_grade_el zlastpromexam_el"
local Y0 "zS3_exam_bli S3_exam_blm"
local i=2
foreach y in `Y' {
	qui ivregress 2sls `y' (exchange_pct=treatment) `Y0' i.strata, cluster(school_code)
	scalar define H3_t4_c`i'b=_b[exchange_pct]
	scalar define H3_t4_c`i'se=_se[exchange_pct]
	scalar define t=_b[exchange_pct]/_se[exchange_pct]
	scalar define dfr=e(N)-e(df_m)
	scalar define H3_t4_c`i'p=2*ttail(dfr,abs(t))
	local i=`i'+1
}

* ENTREPRENEURSHIP SKILLS
/*Goal: produce columns 4-9 of Table P4
	Uses endline student survey.
	Regress outcome on treatment status, controlling for randomization strata. 
	Cluster s.e.'s by school.*/

* prep missing values for students without baseline outcome
local Y0 "wait_10k_bl wait_20k_bl compound_interest_bl anysavings_bl savings_less5k_bl savings_5kto10k_bl savings_more10k_bl eknowledge_index_bl"
foreach y0 in `Y0' {
	qui gen `y0'i=`y0'
	qui replace `y0'i=r(mean) if `y0'==.	
	qui su `y0' if treatment==0
	qui gen `y0'm=(`y0'==.)
	lab var `y0'i "`y0', imputing control mean for missing values"
	lab var `y0'm "missing value for `y0'"
}

* prep continuous financial variables: winsorize at 99th percentile
winsor savings_usd_cond_el, gen(savings_usd_condw_el) p(0.01) highonly
lab var savings_usd_condw_el "savings_usd_cond_el, winsorized at 99th percentile"

* set up control variables, by table column
local Y04 "wait_10k_bli wait_10k_blm"
local Y05 "wait_20k_bli wait_20k_blm"
local Y06 "compound_interest_bli compound_interest_blm"
local Y07 "anysavings_bli anysavings_blm"
local Y08 "savings_less5k_bli savings_5kto10k_bli savings_more10k_bli savings_less5k_blm savings_5kto10k_blm savings_more10k_blm"
local Y09 "eknowledge_index_bli eknowledge_index_blm"

/*regressions: Columns 4-9*/
local Y "wait_10k wait_20k compound_interest anysavings	savings_usd_condw eknowledge_index"
local i=4 /*initialize loop*/
foreach y in `Y' {
	qui ivregress 2sls `y'_el (exchange_pct=treatment) `Y0`i'' i.strata, cluster(school_code)
	scalar define H3_t4_c`i'b=_b[exchange_pct]
	scalar define H3_t4_c`i'se=_se[exchange_pct]
	scalar define t=_b[exchange_pct]/_se[exchange_pct]
	scalar define dfr=e(N)-e(df_m)
	scalar define H3_t4_c`i'p=2*ttail(dfr,abs(t))
	local i=`i'+1
}

* NON-COGNITIVE SKILLS
/*Goal: produce columns 1-5 of Table P5
	Uses endline student survey.
	Regress non-cognitive skill on treatment status, controlling for randomization strata. 
	Cluster s.e.'s by school.*/
* prep missing values for students without baseline outcome
local Y0 "planned_schl_postsec_bl planned_occup_busorpro_bl planned_business_bl	control_avg_bl grit_raw_bl"
foreach y0 in `Y0' {
	qui gen `y0'i=`y0'
	qui replace `y0'i=r(mean) if `y0'==.	
	qui su `y0' if treatment==0
	qui gen `y0'm=(`y0'==.)
	lab var `y0'i "`y0', imputing control mean for missing values"
	lab var `y0'm "missing value for `y0'"
}

local Y "planned_schl_postsec planned_occup_busorpro planned_business control_avg grit_raw"
local j=1 /*initialize loop*/
foreach y in `Y' {
	qui ivregress 2sls `y'_el (exchange_pct=treatment) `y'_bli `y'_blm i.strata, cluster(school_code)
	scalar define H3_t5_c`j'b=_b[exchange_pct]
	scalar define H3_t5_c`j'se=_se[exchange_pct]
	scalar define t=_b[exchange_pct]/_se[exchange_pct]
	scalar define dfr=e(N)-e(df_m)
	scalar define H3_t5_c`j'p=2*ttail(dfr,abs(t))
	local j=`j'+1
}

* 2. CALCULATE Q-VALUES FOR ALL HYPOTHESES
/*Calculate sharpened q-values from Benjamini, Krieger, Yuketeli (2006).
	Use code from fdr_sharpened_qvalues.do from Anderson (2008), available at 
		https://are.berkeley.edu/~mlanderson/ARE_Website/Research.html*/
* prep data: put p-values from above into a dataset
clear
local I=`i'-1 /*#cols in Table 4*/
local J=`j'-1 /*#cols in Table 5*/
local n=`I'+`J'
qui set obs `n'
qui gen hypothesis=3
qui gen table=4
local t5start=`I'+1
qui replace table=5 in `t5start'/`n'
qui bysort table: egen column=seq()
foreach x in b se pval {
	qui gen double `x'=.
}
forval i=2/`I' { /*start at col 2 based on data availability*/
	qui replace b=H3_t4_c`i'b if table==4 & column==`i'
	qui replace se=H3_t4_c`i'se if table==4 & column==`i'
	qui replace pval=H3_t4_c`i'p if table==4 & column==`i' 
}
forval j=1/`J' {
	qui replace b=H3_t5_c`j'b if table==5 & column==`j'
	qui replace se=H3_t5_c`j'se if table==5 & column==`j'
	qui replace pval=H3_t5_c`j'p if table==5 & column==`j' 
}

sort table column

* START OF ANDERSON CODE FROM fdr_sharpened_qvalues.do (modified)
* Collect the total number of p-values tested
quietly sum pval
local totalpvals = r(N)

* Sort the p-values in ascending order and generate a variable that codes each p-value's rank

quietly gen int original_sorting_order = _n
quietly sort pval
quietly gen int rank = _n if pval~=.

* Generate the variable that will contain the BKY (2006) sharpened q-values

qui gen bky06_qval = 1 if pval~=.
	
* Set the initial counter to 1 

local qval = 1

/* Set up a loop that begins by checking which hypotheses are rejected at q = 1.000, 
	then checks which hypotheses are rejected at q = 0.999, 
	then checks which hypotheses are rejected at q = 0.998, etc.  
	The loop ends by checking which hypotheses are rejected at q = 0.001. */
while `qval' > 0 {
	* First Stage
	* Generate the adjusted first stage q level we are testing: q' = q/1+q
	local qval_adj = `qval'/(1+`qval')
	
	* Generate value q'*r/M
	qui gen fdr_temp1 = `qval_adj'*rank/`totalpvals'
	
	* Generate binary variable checking condition p(r) <= q'*r/M
	qui gen reject_temp1 = (fdr_temp1>=pval) if pval~=.
	
	* Generate variable containing p-value ranks for all p-values that meet above condition
	qui gen reject_rank1 = reject_temp1*rank
	
	* Record the rank of the largest p-value that meets above condition
	qui egen total_rejected1 = max(reject_rank1)

	* Second Stage
	* Generate the second stage q level that accounts for hypotheses rejected in first stage: q_2st = q'*(M/m0)
	local qval_2st = `qval_adj'*(`totalpvals'/(`totalpvals'-total_rejected1[1]))
	
	* Generate value q_2st*r/M
	qui gen fdr_temp2 = `qval_2st'*rank/`totalpvals'
	
	* Generate binary variable checking condition p(r) <= q_2st*r/M
	qui gen reject_temp2 = (fdr_temp2>=pval) if pval~=.
	
	* Generate variable containing p-value ranks for all p-values that meet above condition
	qui gen reject_rank2 = reject_temp2*rank
	
	* Record the rank of the largest p-value that meets above condition
	qui egen total_rejected2 = max(reject_rank2)

	* A p-value has been rejected at level q if its rank is less than or equal to the rank of the max p-value that meets the above condition
	qui replace bky06_qval = `qval' if rank <= total_rejected2 & rank~=.
	
	* Reduce q by 0.001 and repeat loop
	drop fdr_temp* reject_temp* reject_rank* total_rejected*
	local qval = `qval' - .001
}

quietly sort original_sorting_order

******************************
* END OF ANDERSON CODE FROM fdr_sharpened_qvalues.do (modified)
******************************
* list results
list hypothesis table column b se pval bky06_qval

* prep results for inclusion in final regression output
sort table column
forval i=2/`I' {
	scalar define H3_t4_c`i'q=bky06_qval in `i'
}
local j=1
forval k=`t5start'/`n' {
	scalar define H3_t5_c`j'q=bky06_qval in `k'
	local j=`j'+1
}

* 3. RE-RUN REGRESSIONS TO GET OUTPUT, INCLUDING Q-VALUES
* ACADEMIC OUTCOMES
/*Goal: produce columns 1-3 of Table P4
	Uses endline student survey.
	NOTE: still awaiting administrative data on exam performance to produce columns 1-2.
	Regress academic outcome on treatment status, controlling
		for randomization strata. 
	Cluster s.e.'s by school.
	Include sharpened q-values calculated above.*/
qui use "$temp/studenttemp.dta", clear

* normalize exam scores to control mean
/*careful because distribution of S6 entrepreneurship exam score is discrete (0-6)*/
foreach y in lastpromexam_el entrep_grade_el S3_exam_bl {
	qui su `y' if treatment==0
	qui gen z`y'=(`y'-r(mean))/r(sd)
}
lab var zentrep_grade_el "S6 entrepreneurship exam score (z)"
lab var zlastpromexam_el "S4/S5 promotional exam score (z)"
lab var zS3_exam_bl "S3 exam score (z)"

* prep missing values for students without baseline outcome
qui gen zS3_exam_bli=zS3_exam_bl
qui su zS3_exam_bl if treatment==0
qui replace zS3_exam_bli=r(mean) if zS3_exam_bl==.	
qui gen S3_exam_blm=(S3_exam_bl==.)
lab var zS3_exam_bli "zS3_exam_bl, imputing control mean for missing values"
lab var S3_exam_blm "missing value for S3_exam_bl"

/*Columns 1-3: exam scores*/
/*Total score (col 1) unavailable*/
local stat ""1st stage F",F,"control mean",mu_c,"baseline mean",mu_0,"q-value",qv"
local Y "zentrep_grade_el zlastpromexam_el"
local Y0 "zS3_exam_bli S3_exam_blm"

local i=2

* zentrep_grade_el
qui ivregress 2sls zentrep_grade_el (exchange_pct=treatment) `Y0' i.strata, cluster(school_code)
qui estat first
matrix define X=r(singleresults)
scalar define F=X[1,4]
qui su zentrep_grade_el if treatment==0 & e(sample)
scalar define mu_c=r(mean)
qui su zS3_exam_bl if e(sample)
scalar define mu_0=r(mean)
scalar define qv=H3_t4_c`i'q
outreg2 exchange_pct using "$results/rwanda_jde_tableA10-A11_t4.xls", se excel nolabel nocons addstat(`stat') replace
local i=`i'+1
	
* zlastpromexam_el
qui ivregress 2sls zlastpromexam_el (exchange_pct=treatment) `Y0' i.strata, cluster(school_code)
qui estat first
matrix define X=r(singleresults)
scalar define F=X[1,4]
qui su zlastpromexam_el if treatment==0 & e(sample)
scalar define mu_c=r(mean)
qui su zS3_exam_bl if e(sample)
scalar define mu_0=r(mean)
scalar define qv=H3_t4_c`i'q
outreg2 exchange_pct using "$results/rwanda_jde_tableA10-A11_t4.xls", se excel nolabel nocons addstat(`stat')  append
local i=`i'+1


* ENTREPRENEURSHIP SKILLS
/*Goal: produce columns 4-9 of Table P4
	Uses endline student survey.
	Regress outcome on treatment status, controlling for randomization strata. 
	Cluster s.e.'s by school.
	Include sharpened q-values calculated above.*/

* prep missing values for students without baseline outcome
local Y0 "wait_10k_bl wait_20k_bl compound_interest_bl anysavings_bl savings_less5k_bl savings_5kto10k_bl savings_more10k_bl eknowledge_index_bl"
foreach y0 in `Y0' {
	qui gen `y0'i=`y0'
	qui replace `y0'i=r(mean) if `y0'==.	
	qui su `y0' if treatment==0
	qui gen `y0'm=(`y0'==.)
	lab var `y0'i "`y0', imputing control mean for missing values"
	lab var `y0'm "missing value for `y0'"
}

* prep continuous financial variables: winsorize at 99th percentile
winsor savings_usd_cond_el, gen(savings_usd_condw_el) p(0.01) highonly
lab var savings_usd_condw_el "savings_usd_cond_el, winsorized at 99th percentile"

* set up control variables, by table column
local Y04 "wait_10k_bli wait_10k_blm"
local Y05 "wait_20k_bli wait_20k_blm"
local Y06 "compound_interest_bli compound_interest_blm"
local Y07 "anysavings_bli anysavings_blm"
local Y08 "savings_less5k_bli savings_5kto10k_bli savings_more10k_bli savings_less5k_blm savings_5kto10k_blm savings_more10k_blm"
local Y09 "eknowledge_index_bli eknowledge_index_blm"

/*regressions: Columns 4-9*/
local Y "wait_10k wait_20k compound_interest anysavings	savings_usd_condw eknowledge_index"
local i=4 /*initialize loop*/
foreach y in `Y' {
	qui ivregress 2sls `y'_el (exchange_pct=treatment) `Y0`i'' i.strata, cluster(school_code)
	qui estat first
	matrix define X=r(singleresults)
	scalar define F=X[1,4]
	qui su `y'_el if treatment==0 & e(sample)
	scalar define mu_c=r(mean)
	if ("`y'"!="savings_usd_condw") {/*baseline outcome for column 8 is a vector, so omit from report*/
		qui su `y'_bl if e(sample)
		scalar define mu_0=r(mean)
	}
	scalar define qv=H3_t4_c`i'q
	outreg2 exchange_pct using "$results/rwanda_jde_tableA10-A11_t4.xls", se excel nolabel nocons addstat(`stat') append
	local i=`i'+1
}

* NON-COGNITIVE SKILLS
/*Goal: produce columns 1-5 of Table P5
	Uses endline student survey.
	Regress non-cognitive skill on treatment status, controlling for randomization strata. 
	Cluster s.e.'s by school.
	Include sharpened q-values calculated above.*/
* prep missing values for students without baseline outcome
local Y0 "planned_schl_postsec_bl planned_occup_busorpro_bl planned_business_bl	control_avg_bl grit_raw_bl"
foreach y0 in `Y0' {
	qui gen `y0'i=`y0'
	qui replace `y0'i=r(mean) if `y0'==.	
	qui su `y0' if treatment==0
	qui gen `y0'm=(`y0'==.)
	lab var `y0'i "`y0', imputing control mean for missing values"
	lab var `y0'm "missing value for `y0'"
}

/*regressions: Columns 1-5*/
local stat ""1st stage F",F,"control mean",mu_c,"baseline mean",mu_0,"q-value",qv"
local Y "planned_occup_busorpro planned_business control_avg grit_raw"
local j=1 /*initialize loop*/

* Column 1: planned_schl_postsec
	qui ivregress 2sls planned_schl_postsec_el (exchange_pct=treatment) planned_schl_postsec_bli planned_schl_postsec_blm i.strata, cluster(school_code)
	qui estat first
	matrix define X=r(singleresults)
	scalar define F=X[1,4]
	qui su planned_schl_postsec_el if treatment==0
	scalar define mu_c=r(mean)
	qui su planned_schl_postsec_bl
	scalar define mu_0=r(mean)
	scalar define qv=H3_t5_c`j'q
	outreg2 exchange_pct using "$results/rwanda_jde_tableA10-A11_t5.xls", se excel nolabel nocons addstat(`stat') replace

* Columns 2 -5
foreach y in `Y' {
	qui ivregress 2sls `y'_el (exchange_pct=treatment) `y'_bli `y'_blm i.strata, cluster(school_code)
	qui estat first
	matrix define X=r(singleresults)
	scalar define F=X[1,4]
	qui su `y'_el if treatment==0
	scalar define mu_c=r(mean)
	qui su `y'_bl
	scalar define mu_0=r(mean)
	scalar define qv=H3_t5_c`j'q
	outreg2 exchange_pct using "$results/rwanda_jde_tableA10-A11_t5.xls", se excel nolabel nocons addstat(`stat') append
}


erase "$temp/studenttemp.dta"
local end=`"$S_TIME"' 
di "`start'"
di "`end'"
log close
